# knitr chunk options for the rendered document (figures to img/, retina PNGs
# via cairo, output lines prefixed "#>").
knitr::opts_chunk$set(
echo = TRUE,
error = FALSE,
comment = "#>",
fig.path = "img/",
fig.retina = 2,
fig.width = 10,
fig.asp = 3/4,
# NOTE(review): fig.asp recomputes fig.height from fig.width, so this
# fig.height is likely ignored — confirm which setting is intended.
fig.height = 20,
fig.pos = "t",
fig.align = "center",
dpi = 150,
out.width = "90%",
dev.args = list(png = list(type = "cairo-png")),
optipng = "-o1 -quiet"
)
library(tidyverse)
#> ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
#> ✔ dplyr 1.1.2 ✔ readr 2.1.4
#> ✔ forcats 1.0.0 ✔ stringr 1.5.0
#> ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
#> ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
#> ✔ purrr 1.0.1
#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#> ✖ dplyr::filter() masks stats::filter()
#> ✖ dplyr::lag() masks stats::lag()
#> ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(factoextra)
#> Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(scorecard)
#>
#> Attaching package: 'scorecard'
#>
#> The following object is masked from 'package:tidyr':
#>
#> replace_na
library(glmnet)
#> Loading required package: Matrix
#>
#> Attaching package: 'Matrix'
#>
#> The following objects are masked from 'package:tidyr':
#>
#> expand, pack, unpack
#>
#> Loaded glmnet 4.1-8
library(ggplot2)
library(plotly)
#>
#> Attaching package: 'plotly'
#>
#> The following object is masked from 'package:ggplot2':
#>
#> last_plot
#>
#> The following object is masked from 'package:stats':
#>
#> filter
#>
#> The following object is masked from 'package:graphics':
#>
#> layout
library(dplyr)
library(xefun)
library(modeest)
#> Registered S3 method overwritten by 'rmutil':
#> method from
#> print.response httr
library(cluster)
library(GA)
#> Loading required package: foreach
#>
#> Attaching package: 'foreach'
#>
#> The following objects are masked from 'package:purrr':
#>
#> accumulate, when
#>
#> Loading required package: iterators
#> Package 'GA' version 3.2.4
#> Type 'citation("GA")' for citing this R package in publications.
#>
#> Attaching package: 'GA'
#>
#> The following object is masked from 'package:utils':
#>
#> de
library(dendextend)
#>
#> ---------------------
#> Welcome to dendextend version 1.17.1
#> Type citation('dendextend') for how to cite the package.
#>
#> Type browseVignettes(package = 'dendextend') for the package vignette.
#> The github page is: https://github.com/talgalili/dendextend/
#>
#> Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
#> You may ask questions at stackoverflow, use the r and dendextend tags:
#> https://stackoverflow.com/questions/tagged/dendextend
#>
#> To suppress this message use: suppressPackageStartupMessages(library(dendextend))
#> ---------------------
#>
#>
#> Attaching package: 'dendextend'
#>
#> The following object is masked from 'package:stats':
#>
#> cutree
library(parallel)
library(ROCR)
library(gridExtra)
#>
#> Attaching package: 'gridExtra'
#>
#> The following object is masked from 'package:dplyr':
#>
#> combine
library(grid)
library(writexl)
library(openxlsx)
library(clusterSim)
#> Loading required package: MASS
#>
#> Attaching package: 'MASS'
#>
#> The following object is masked from 'package:plotly':
#>
#> select
#>
#> The following object is masked from 'package:dplyr':
#>
#> select
library(ROCR)
library(verification)
#> Loading required package: fields
#> Loading required package: spam
#> Spam version 2.9-1 (2022-08-07) is loaded.
#> Type 'help( Spam)' or 'demo( spam)' for a short introduction
#> and overview of this package.
#> Help for individual functions is also obtained by adding the
#> suffix '.spam' to the function name, e.g. 'help( chol.spam)'.
#>
#> Attaching package: 'spam'
#>
#> The following object is masked from 'package:Matrix':
#>
#> det
#>
#> The following objects are masked from 'package:base':
#>
#> backsolve, forwardsolve
#>
#> Loading required package: viridisLite
#>
#> Try help(fields) to get started.
#>
#> Attaching package: 'fields'
#>
#> The following object is masked from 'package:xefun':
#>
#> ceiling2
#>
#> The following object is masked from 'package:scorecard':
#>
#> describe
#>
#> Loading required package: boot
#> Loading required package: CircStats
#> Loading required package: dtw
#> Loading required package: proxy
#>
#> Attaching package: 'proxy'
#>
#> The following object is masked from 'package:spam':
#>
#> as.matrix
#>
#> The following object is masked from 'package:Matrix':
#>
#> as.matrix
#>
#> The following objects are masked from 'package:stats':
#>
#> as.dist, dist
#>
#> The following object is masked from 'package:base':
#>
#> as.matrix
#>
#> Loaded dtw v1.23-1. See ?dtw for help, citation("dtw") for use in publication.
library(pROC)
#> Registered S3 method overwritten by 'pROC':
#> method from
#> lines.roc verification
#> Type 'citation("pROC")' for a citation.
#>
#> Attaching package: 'pROC'
#>
#> The following object is masked from 'package:verification':
#>
#> lines.roc
#>
#> The following objects are masked from 'package:stats':
#>
#> cov, smooth, var
library(xgboost)
#>
#> Attaching package: 'xgboost'
#>
#> The following object is masked from 'package:plotly':
#>
#> slice
#>
#> The following object is masked from 'package:dplyr':
#>
#> slice
library(Matrix)
library(dbscan)
#>
#> Attaching package: 'dbscan'
#>
#> The following object is masked from 'package:stats':
#>
#> as.dendrogram
library(knitr)
# Load the data
# NOTE(review): absolute Windows path — not portable; consider a relative
# path (or here::here()) so the script runs on other machines.
train_data <- read.csv("C:/Users/lenovo/Downloads/train.csv")
# Preview the data
str(train_data)
#> 'data.frame': 81738 obs. of 21 variables:
#> $ loan_amnt : int 5000 2400 5000 3000 5600 5375 6500 9000 3000 10000 ...
#> $ funded_amnt : int 5000 2400 5000 3000 5600 5375 6500 9000 3000 10000 ...
#> $ pymnt_plan : chr "n" "n" "n" "n" ...
#> $ grade : chr "B" "C" "A" "E" ...
#> $ sub_grade_num : num 0.4 1 0.8 0.2 0.4 1 0.6 0.2 0.2 0.4 ...
#> $ short_emp : int 0 0 0 0 0 1 0 1 0 0 ...
#> $ emp_length_num : int 11 11 4 10 5 1 6 1 4 4 ...
#> $ home_ownership : chr "RENT" "RENT" "RENT" "RENT" ...
#> $ dti : num 27.65 8.72 11.2 5.35 5.55 ...
#> $ purpose : chr "credit_card" "small_business" "wedding" "car" ...
#> $ payment_inc_ratio : num 8.14 8.26 5.22 2.74 4.57 ...
#> $ delinq_2yrs : int 0 0 0 0 0 0 0 0 0 0 ...
#> $ delinq_2yrs_zero : int 1 1 1 1 1 1 1 1 1 1 ...
#> $ inq_last_6mths : int 1 2 3 2 2 0 2 1 2 2 ...
#> $ last_delinq_none : int 1 1 1 1 1 1 1 1 1 1 ...
#> $ last_major_derog_none: int 1 1 1 1 1 1 1 1 1 1 ...
#> $ open_acc : int 3 2 9 4 11 2 14 4 11 14 ...
#> $ pub_rec : int 0 0 0 0 0 0 0 0 0 0 ...
#> $ pub_rec_zero : int 1 1 1 1 1 1 1 1 1 1 ...
#> $ revol_util : num 83.7 98.5 28.3 87.5 32.6 36.5 20.6 91.7 43.1 55.5 ...
#> $ bad_loans : int 0 0 0 0 1 1 0 1 0 1 ...
# Frequency tables for the categorical predictors.
purpose_counts <- table(train_data$purpose)
print(purpose_counts)
#>
#> car credit_card debt_consolidation home_improvement
#> 1570 14722 45428 4990
#> house major_purchase medical moving
#> 665 2580 1085 792
#> other small_business vacation wedding
#> 6107 2173 587 1039
home_ownership_counts <- table(train_data$home_ownership)
print(home_ownership_counts)
#>
#> MORTGAGE OTHER OWN RENT
#> 39583 116 6639 35400
grade_counts <- table(train_data$grade)
print(grade_counts)
#>
#> A B C D E F G
#> 14812 24775 19928 12847 6022 2611 743
# Check for missing values and duplicates
# Total NA count across the whole data frame (per-column breakdown follows).
sum(is.na(train_data))
#> [1] 97
# Convert categorical predictors to numeric codes.
# FIX: the original passed `levels` to as.numeric() — where it is silently
# ignored — instead of factor(), so the stated orderings were never applied.
# Here the levels go inside factor(). The stated orders are alphabetical,
# matching factor()'s default, so the resulting codes are unchanged.
train_data <- train_data %>%
  mutate(
    grade = as.numeric(factor(grade, levels = c("A", "B", "C", "D", "E", "F", "G"))),
    purpose = as.numeric(factor(purpose, levels = c("car", "credit_card", "debt_consolidation", "home_improvement", "house", "major_purchase", "medical", "moving", "other", "small_business", "vacation", "wedding"))),
    home_ownership = as.numeric(factor(home_ownership, levels = c("MORTGAGE", "OTHER", "OWN", "RENT")))
  )
# Remove unnecessary columns
train_data <- dplyr::select(train_data, -pymnt_plan)
# Count missing values per column (vapply is type-stable, unlike sapply).
na_counts <- vapply(train_data, function(x) sum(is.na(x)), integer(1))
print(na_counts)
#> loan_amnt funded_amnt grade
#> 0 0 0
#> sub_grade_num short_emp emp_length_num
#> 0 0 0
#> home_ownership dti purpose
#> 0 0 0
#> payment_inc_ratio delinq_2yrs delinq_2yrs_zero
#> 1 16 16
#> inq_last_6mths last_delinq_none last_major_derog_none
#> 16 0 0
#> open_acc pub_rec pub_rec_zero
#> 16 16 16
#> revol_util bad_loans
#> 0 0
# Draw a reproducible 50% random subsample for the clustering work below.
set.seed(1)
total_rows <- nrow(train_data)
subset_size <- total_rows / 2
# NOTE(review): sample_n() is superseded by slice_sample(); left as-is so the
# recorded output (which depends on the exact RNG call sequence) still matches.
random_subset <- train_data %>%
sample_n(subset_size)
cat("Total rows:", total_rows, "\n")
#> Total rows: 81738
cat("Subset size:", subset_size, "\n")
#> Subset size: 40869
print(head(random_subset))
#> loan_amnt funded_amnt grade sub_grade_num short_emp emp_length_num
#> 1 12000 12000 3 1.0 0 4
#> 2 18550 18550 4 0.4 0 5
#> 3 10000 10000 4 0.6 0 5
#> 4 9600 9600 1 0.4 1 0
#> 5 12000 12000 1 0.4 0 11
#> 6 6000 6000 5 0.4 0 6
#> home_ownership dti purpose payment_inc_ratio delinq_2yrs delinq_2yrs_zero
#> 1 4 9.14 3 11.76200 0 1
#> 2 4 18.64 7 8.77438 0 1
#> 3 1 10.87 2 6.54269 0 1
#> 4 4 19.14 3 5.89520 1 0
#> 5 1 4.20 10 6.23880 0 1
#> 6 4 19.92 3 4.27303 0 1
#> inq_last_6mths last_delinq_none last_major_derog_none open_acc pub_rec
#> 1 5 0 1 15 1
#> 2 1 1 1 4 0
#> 3 3 1 1 8 0
#> 4 1 0 1 13 0
#> 5 0 1 1 14 0
#> 6 6 0 1 16 0
#> pub_rec_zero revol_util bad_loans
#> 1 0 63.5 0
#> 2 1 47.1 0
#> 3 1 50.8 0
#> 4 1 23.7 0
#> 5 1 6.0 0
#> 6 1 45.1 0
# dataframe
train_data1 <- random_subset
# Information value (IV) of every predictor against the target, sorted
# descending — higher IV means stronger univariate predictive power.
# NOTE(review): this assignment shadows scorecard::iv() with a tibble of the
# same name; rename (e.g. iv_tbl) if iv() is needed again later in the file.
iv = iv(train_data1, y = 'bad_loans') %>%
as_tibble() %>%
mutate( info_value = round(info_value, 3) ) %>%
arrange( desc(info_value) )
iv %>%
knitr::kable()
| variable | info_value |
|---|---|
| dti | 0.485 |
| grade | 0.329 |
| revol_util | 0.252 |
| loan_amnt | 0.236 |
| funded_amnt | 0.234 |
| payment_inc_ratio | 0.081 |
| purpose | 0.042 |
| inq_last_6mths | 0.041 |
| home_ownership | 0.017 |
| emp_length_num | 0.010 |
| open_acc | 0.008 |
| short_emp | 0.004 |
| delinq_2yrs | 0.003 |
| pub_rec | 0.001 |
| delinq_2yrs_zero | 0.001 |
| sub_grade_num | 0.001 |
| last_delinq_none | 0.000 |
| last_major_derog_none | 0.000 |
| pub_rec_zero | 0.000 |
# Weight-of-evidence binning of every predictor against bad_loans.
bins <- woebin(train_data1, y = 'bad_loans')
#> ℹ Creating woe binning ...
#> ✔ Binning on 40869 rows and 20 columns in 00:00:12
# Names of the binned variables; `bins[]` in the original was a no-op copy.
variables <- names(bins)
# Replace each raw value with its bin's WOE value.
train_woe <- woebin_ply(train_data1, bins)
#> ℹ Converting into woe values ...
#> ✔ Woe transformating on 40869 rows and 19 columns in 00:00:11
df_train <- train_woe
# Remove the target column before running PCA on the WOE features.
df1 <- dplyr::select(df_train, -bad_loans)
# Perform PCA (centered, unscaled; FALSE spelled out — T/F are reassignable).
data_pca <- prcomp(df1, center = TRUE, scale. = FALSE)
summary(data_pca)
#> Importance of components:
#> PC1 PC2 PC3 PC4 PC5 PC6 PC7
#> Standard deviation 0.6260 0.3721 0.2820 0.22881 0.20052 0.17533 0.15299
#> Proportion of Variance 0.5004 0.1768 0.1016 0.06685 0.05135 0.03926 0.02989
#> Cumulative Proportion 0.5004 0.6772 0.7788 0.84562 0.89696 0.93622 0.96610
#> PC8 PC9 PC10 PC11 PC12 PC13 PC14
#> Standard deviation 0.12092 0.08631 0.04382 0.03642 0.02566 0.01404 0.01132
#> Proportion of Variance 0.01867 0.00951 0.00245 0.00169 0.00084 0.00025 0.00016
#> Cumulative Proportion 0.98478 0.99429 0.99674 0.99843 0.99927 0.99953 0.99969
#> PC15 PC16 PC17 PC18 PC19
#> Standard deviation 0.009821 0.009364 0.007662 0.0006635 2.419e-16
#> Proportion of Variance 0.000120 0.000110 0.000070 0.0000000 0.000e+00
#> Cumulative Proportion 0.999810 0.999920 1.000000 1.0000000 1.000e+00
# PCA loadings (rotation matrix): contribution of each WOE variable to each PC.
loadings <- data_pca$rotation
head(loadings)
#> PC1 PC2 PC3 PC4
#> loan_amnt_woe 0.073925136 -0.253820895 0.063586182 -0.198899745
#> funded_amnt_woe 0.073912131 -0.251889280 0.062201727 -0.195292233
#> grade_woe 0.945146341 0.221287928 0.199161724 0.035117355
#> sub_grade_num_woe -0.005563003 -0.005599729 -0.009461251 -0.002620156
#> short_emp_woe -0.002213074 -0.001496193 0.005141298 0.004143282
#> emp_length_num_woe -0.002408340 -0.001046150 0.006117357 0.004535543
#> PC5 PC6 PC7 PC8
#> loan_amnt_woe -0.596521308 0.083046755 -0.070652246 0.16337237
#> funded_amnt_woe -0.585355665 0.082238273 -0.068350612 0.16071107
#> grade_woe 0.032423462 0.105600793 0.057545178 -0.03161873
#> sub_grade_num_woe -0.004397023 -0.013970272 -0.011040741 0.01266304
#> short_emp_woe 0.044973686 -0.008747212 -0.003109089 0.06880736
#> emp_length_num_woe 0.049823527 -0.009151991 -0.003684869 0.07775236
#> PC9 PC10 PC11 PC12
#> loan_amnt_woe -0.024335682 0.0044949026 -0.003463462 -0.0037211962
#> funded_amnt_woe -0.024055461 0.0030525030 0.001655281 0.0140260871
#> grade_woe -0.004505497 -0.0008711838 -0.011588331 -0.0116795853
#> sub_grade_num_woe 0.001316874 0.0092330898 0.056617537 -0.9974973244
#> short_emp_woe -0.693738384 0.0058779791 0.001623484 0.0007030922
#> emp_length_num_woe -0.709286624 0.0054178875 0.003796025 -0.0005814363
#> PC13 PC14 PC15 PC16
#> loan_amnt_woe 0.6992147436 -0.049363944 -0.0101610460 0.002800952
#> funded_amnt_woe -0.7113296330 0.047644996 0.0111328205 0.001980115
#> grade_woe 0.0006441983 0.004242974 0.0001074442 -0.002173667
#> sub_grade_num_woe -0.0106377005 0.025865070 0.0001360339 -0.014421276
#> short_emp_woe -0.0152838291 -0.074032288 -0.6886503038 -0.178334445
#> emp_length_num_woe 0.0148283926 0.067794163 0.6727614217 0.175270894
#> PC17 PC18 PC19
#> loan_amnt_woe -0.001259353 4.512122e-05 2.014199e-15
#> funded_amnt_woe -0.001106719 -4.082085e-05 -7.996344e-16
#> grade_woe 0.001585032 8.722092e-06 9.584531e-17
#> sub_grade_num_woe 0.007808105 1.586855e-05 3.545324e-16
#> short_emp_woe -0.006202354 -3.354520e-04 4.690120e-16
#> emp_length_num_woe 0.005881672 6.461365e-05 -5.546858e-16
# Scree plot: percentage of variance explained by each component.
fviz_eig(data_pca, addlabels = TRUE)
# Check the PCA output
head(data_pca$x)
#> PC1 PC2 PC3 PC4 PC5 PC6
#> [1,] 0.2921316 -0.19089166 0.1057117 -0.05820084 0.219797540 -0.412702689
#> [2,] 0.5134660 -0.03691616 0.2125867 0.11715191 -0.304435505 0.095277648
#> [3,] 0.5207891 0.18552810 0.1064171 -0.05423267 0.007266799 -0.377334274
#> [4,] -1.1263020 0.07704632 0.1638929 0.32977103 -0.098309680 0.007099912
#> [5,] -1.1771336 0.15374169 0.3223907 -0.14675212 -0.013589018 0.164693871
#> [6,] 0.8368147 0.58757265 0.2756430 0.36246019 0.045630945 -0.191133919
#> PC7 PC8 PC9 PC10 PC11
#> [1,] -0.05091001 0.09821871 0.06706506 0.028341563 -0.005671512
#> [2,] 0.18346330 0.20878863 0.03659958 0.059627400 -0.022679101
#> [3,] 0.19399641 -0.10711691 0.02778540 -0.003411623 -0.020516487
#> [4,] -0.06360999 0.19311549 -0.21201255 -0.055867561 0.097186839
#> [5,] -0.31250876 -0.15023069 0.02451762 0.028734360 -0.016472905
#> [6,] 0.01093339 0.11298191 0.05369666 -0.053209611 -0.023127403
#> PC12 PC13 PC14 PC15 PC16
#> [1,] -0.0210751094 -0.0009599073 -0.012285314 -0.008755016 0.023993526
#> [2,] 0.0045274794 -0.0013699203 0.008656917 -0.003202198 -0.002783650
#> [3,] -0.0077617866 0.0004323956 0.009952554 -0.002231300 -0.004361317
#> [4,] 0.0209509095 -0.0005126908 0.003215737 -0.003733679 -0.001526565
#> [5,] 0.0111541083 -0.0014094692 0.004725085 -0.002400555 -0.001721166
#> [6,] -0.0002443128 -0.0013473640 -0.003498016 -0.001031571 -0.009234172
#> PC17 PC18 PC19
#> [1,] -0.017875967 -2.174590e-05 2.755393e-16
#> [2,] 0.003341090 1.209072e-05 3.543329e-16
#> [3,] 0.004376651 3.914620e-05 -5.839496e-17
#> [4,] -0.007516465 -6.584101e-05 -2.862365e-16
#> [5,] 0.004050195 -5.141151e-05 -1.828037e-16
#> [6,] -0.011053840 4.810600e-05 -1.036341e-16
# Eigenvalues and cumulative variance explained per dimension.
eig.val<-get_eigenvalue(data_pca)
eig.val
#> eigenvalue variance.percent cumulative.variance.percent
#> Dim.1 3.918482e-01 5.003707e+01 50.03707
#> Dim.2 1.384765e-01 1.768276e+01 67.71983
#> Dim.3 7.953661e-02 1.015643e+01 77.87626
#> Dim.4 5.235333e-02 6.685260e+00 84.56152
#> Dim.5 4.020963e-02 5.134570e+00 89.69609
#> Dim.6 3.074142e-02 3.925526e+00 93.62162
#> Dim.7 2.340642e-02 2.988883e+00 96.61050
#> Dim.8 1.462112e-02 1.867045e+00 98.47754
#> Dim.9 7.449615e-03 9.512788e-01 99.42882
#> Dim.10 1.919864e-03 2.451571e-01 99.67398
#> Dim.11 1.326062e-03 1.693315e-01 99.84331
#> Dim.12 6.586397e-04 8.410501e-02 99.92742
#> Dim.13 1.970018e-04 2.515616e-02 99.95257
#> Dim.14 1.281299e-04 1.636155e-02 99.96893
#> Dim.15 9.645838e-05 1.231726e-02 99.98125
#> Dim.16 8.767756e-05 1.119599e-02 99.99245
#> Dim.17 5.870826e-05 7.496753e-03 99.99994
#> Dim.18 4.402401e-07 5.621647e-05 100.00000
#> Dim.19 5.852888e-32 7.473848e-30 100.00000
# Determine the optimal number of clusters using the Elbow method on PCA components
# NOTE(review): this second PCA uses center = FALSE while the exploratory PCA
# above used center = TRUE — confirm the lack of centering is intentional.
data_pca_final <- prcomp(df1, center = FALSE, scale. = FALSE, rank. = 3)
results <- data_pca_final$x
set.seed(2)
# Total within-cluster sum of squares for a k-means fit with k centers.
# Reads `results` from the enclosing environment.
wss1 <- function(k) {
  kmeans(results, centers = k, iter.max = 100, nstart = 50)$tot.withinss
}
k.values <- 1:10
# vapply is type-stable (one numeric WSS per candidate k), unlike sapply.
wss_values1 <- vapply(k.values, wss1, numeric(1))
wss_values1
#> [1] 24923.572 14400.046 10627.189 8140.346 6908.100 5934.548 5332.015
#> [8] 4739.680 4306.678 3945.009
# Elbow plot: WSS against the candidate number of clusters.
plot(
  k.values, wss_values1,
  type = 'b',
  xlab = 'Number of Clusters',
  ylab = 'Total Within-Cluster Sum of Squares',
  main = 'Elbow Method'
)
# Perform k-means clustering
set.seed(1)
km_res <- kmeans(results, centers = 3, nstart = 100)
# Visualise the 3-cluster solution over the first two PCs.
fviz_cluster(km_res, data = results) +
scale_color_manual(values = c('steelblue3', 'sandybrown', '#9CDB9E')) +
scale_fill_manual(values = c('steelblue3', 'sandybrown', '#9CDB9E')) +
ggtitle("3 Cluster for result of pca") +
theme_minimal()
# Attach the cluster label to the PCA scores.
Customers_Segments <- data.frame(results, cluster = as.factor(km_res$cluster))
km_res$size
#> [1] 7309 19836 13724
km_res$centers
#> PC1 PC2 PC3
#> 1 -1.1658261 -0.08222403 -0.007465017
#> 2 -0.1858829 0.09897490 -0.011609765
#> 3 0.5382256 -0.04571766 0.024009623
# Split the WOE data by k-means cluster membership.
df1$groupkm <- km_res$cluster
g1 <- df1[df1$groupkm == 1, ]
g2 <- df1[df1$groupkm == 2, ]
# FIX: g3 previously filtered on cluster 2 (copy-paste error), so it
# duplicated g2 and cluster 3 was never extracted.
g3 <- df1[df1$groupkm == 3, ]
set.seed(8)
# Fitness for the GA. ga() maximises fitness, and this returns the *negative*
# WSS, so the GA minimises total within-cluster sum of squares.
# NOTE(review): WSS decreases monotonically in k, so this objective will
# always favour the largest k in [lower, upper] — confirm this criterion is
# intended (a penalised score such as silhouette may have been meant).
objective_function <- function(params) {
k <- round(params)
if (k < 2) k <- 2
return(-calculate_wss(k, results))
}
# Total within-cluster SS for a k-means fit with k centers on `results`.
calculate_wss <- function(k, results) {
kmeans_result <- kmeans(results, centers = k, nstart = 100)
return(kmeans_result$tot.withinss)
}
# Real-valued GA over k in [1, 10]; stops after 10 generations without
# improvement; progress monitor only when running interactively.
ga_result <- ga(
type = "real-valued",
fitness = objective_function,
lower = 1,
upper = 10,
run = 10,
parallel = TRUE,
monitor = if(interactive()) gaMonitor else FALSE
)
# Round the GA's best real-valued solution back to an integer cluster count.
best_k <- round(ga_result@solution[1])
optimal_kmeans <- kmeans(results, centers = best_k, nstart = 100)
fviz_cluster(optimal_kmeans, data = results)+
ggtitle("GACluster for result of pca") +
theme_minimal()
## Realization of clusters
df1$group_GA <-optimal_kmeans$cluster
# Per-cluster subsets of the WOE data.
# NOTE(review): only clusters 1–5 are extracted, but the summaries below show
# 10 GA clusters — g6_GA..g10_GA are never created; confirm whether these
# subsets are still needed.
g1_GA<- df1[df1$group_GA==1,]
g2_GA<- df1[df1$group_GA==2,]
g3_GA<- df1[df1$group_GA==3,]
g4_GA<- df1[df1$group_GA==4,]
g5_GA<- df1[df1$group_GA==5,]
# Create summary table for each cluster
# Min / max / mean of selected WOE features within each GA cluster.
cluster_summary <- df1 %>%
group_by(group_GA) %>%
summarise(
count = n(),
loan_amnt_min = min(loan_amnt_woe),
loan_amnt_max = max(loan_amnt_woe),
loan_amnt_mean = mean(loan_amnt_woe),
funded_amnt_min = min(funded_amnt_woe),
funded_amnt_max = max(funded_amnt_woe),
funded_amnt_mean = mean(funded_amnt_woe),
sub_grade_num_min = min(sub_grade_num_woe),
sub_grade_num_max = max(sub_grade_num_woe),
emp_length_num_min = min(emp_length_num_woe),
emp_length_num_max = max(emp_length_num_woe),
dti_min = min(dti_woe),
dti_max = max(dti_woe),
dti_mean = mean(dti_woe),
revol_util_min = min(revol_util_woe),
revol_util_max = max(revol_util_woe),
revol_util_mean = mean(revol_util_woe),
)
# Display the summary
# Transposed so clusters run across the columns.
cluster_summary <- t(cluster_summary)
print(cluster_summary)
#> [,1] [,2] [,3] [,4]
#> group_GA 1.000000e+00 2.00000000 3.00000000 4.00000000
#> count 2.754000e+03 4114.00000000 3930.00000000 3020.00000000
#> loan_amnt_min -1.783359e-01 -0.17833591 -0.17833591 -0.17833591
#> loan_amnt_max 2.374314e-01 0.23743139 0.23743139 0.23743139
#> loan_amnt_mean 2.808564e-02 -0.05250190 -0.05468618 -0.07178379
#> funded_amnt_min -1.741437e-01 -0.17414371 -0.17414371 -0.17414371
#> funded_amnt_max 2.367068e-01 0.23670680 0.23670680 0.23670680
#> funded_amnt_mean 2.642411e-02 -0.05151174 -0.05440339 -0.07256304
#> sub_grade_num_min -4.653398e-02 -0.04653398 -0.04653398 -0.04653398
#> sub_grade_num_max 2.524326e-02 0.02524326 0.02524326 0.02524326
#> emp_length_num_min -2.886748e-02 -0.02886748 -0.02886748 -0.02886748
#> emp_length_num_max 1.619973e-01 0.16199733 0.16199733 0.16199733
#> dti_min -3.666917e-01 -0.36669167 -0.36669167 -0.36669167
#> dti_max 5.429565e-01 0.54295647 0.54295647 0.54295647
#> dti_mean -7.408257e-03 -0.12762850 -0.14090143 -0.01885170
#> revol_util_min -5.472870e-01 -0.54728702 -0.54728702 -0.54728702
#> revol_util_max 4.181434e-01 0.06930215 0.06930215 0.41814344
#> revol_util_mean 1.352461e-01 -0.29891951 -0.33697117 -0.02194559
#> [,5] [,6] [,7] [,8]
#> group_GA 5.00000000 6.00000000 7.00000000 8.00000000
#> count 5555.00000000 1314.00000000 3759.00000000 7801.00000000
#> loan_amnt_min -0.17833591 -0.17833591 -0.17833591 -0.17833591
#> loan_amnt_max 0.23743139 0.23743139 0.23743139 0.23743139
#> loan_amnt_mean 0.08406479 0.09077796 0.13944395 -0.04227538
#> funded_amnt_min -0.17414371 -0.17414371 -0.17414371 -0.17414371
#> funded_amnt_max 0.23670680 0.23670680 0.23670680 0.23670680
#> funded_amnt_mean 0.08422553 0.09081283 0.13933902 -0.04113021
#> sub_grade_num_min -0.04653398 -0.04653398 -0.04653398 -0.04653398
#> sub_grade_num_max 0.02524326 0.02524326 0.02524326 0.02524326
#> emp_length_num_min -0.02886748 -0.02886748 -0.02886748 -0.02886748
#> emp_length_num_max 0.16199733 0.16199733 0.16199733 0.16199733
#> dti_min -0.36669167 -0.36669167 -0.36669167 -0.36669167
#> dti_max 0.54295647 0.54295647 0.54295647 0.54295647
#> dti_mean 0.06808741 -0.01370626 0.09041298 0.02192296
#> revol_util_min -0.54728702 -0.54728702 -0.54728702 -0.18215209
#> revol_util_max 0.41814344 0.41814344 0.41814344 0.41814344
#> revol_util_mean 0.02208711 -0.23495464 0.11881948 0.20019597
#> [,9] [,10]
#> group_GA 9.00000000 10.00000000
#> count 2975.00000000 5647.00000000
#> loan_amnt_min -0.17833591 -0.17833591
#> loan_amnt_max 0.23743139 0.23743139
#> loan_amnt_mean -0.09045170 -0.05363854
#> funded_amnt_min -0.17414371 -0.17414371
#> funded_amnt_max 0.23670680 0.23670680
#> funded_amnt_mean -0.09113556 -0.05291467
#> sub_grade_num_min -0.04653398 -0.04653398
#> sub_grade_num_max 0.02524326 0.02524326
#> emp_length_num_min -0.02886748 -0.02886748
#> emp_length_num_max 0.16199733 0.16199733
#> dti_min -0.36669167 -0.36669167
#> dti_max 0.54295647 0.54295647
#> dti_mean -0.18348016 0.01412957
#> revol_util_min -0.54728702 -0.18215209
#> revol_util_max -0.18215209 0.41814344
#> revol_util_mean -0.50297989 0.14052850
cluster_summary_df <- as.data.frame(cluster_summary)
###table
#grade
# Cross-tab: counts of each grade WOE level within each GA cluster
# (one column per observed WOE value, missing combinations filled with 0).
grade_summary <- df1 %>%
  count(group_GA, grade_woe, name = "count") %>%
  pivot_wider(
    names_from = grade_woe,
    values_from = count,
    values_fill = list(count = 0)
  )
kable(grade_summary, caption = "Counts of 'grade_woe' by Group")
| group_GA | 0.860425570044054 | 0.103685264830283 | 0.421220703081032 | -0.319075617367842 | -1.12236611095412 |
|---|---|---|---|---|---|
| 1 | 2754 | 0 | 0 | 0 | 0 |
| 2 | 166 | 2561 | 1387 | 0 | 0 |
| 3 | 0 | 0 | 0 | 3930 | 0 |
| 4 | 0 | 0 | 0 | 0 | 3020 |
| 5 | 0 | 2762 | 0 | 2793 | 0 |
| 6 | 0 | 0 | 0 | 0 | 1314 |
| 7 | 1787 | 0 | 1972 | 0 | 0 |
| 8 | 0 | 4746 | 3055 | 0 | 0 |
| 9 | 0 | 0 | 0 | 0 | 2975 |
| 10 | 0 | 0 | 0 | 5647 | 0 |
#short_emp
# Cross-tab: counts of each short_emp WOE level within each GA cluster.
short_emp_summary <- df1 %>%
  count(group_GA, short_emp_woe, name = "count") %>%
  pivot_wider(
    names_from = short_emp_woe,
    values_from = count,
    values_fill = list(count = 0)
  )
kable(short_emp_summary, caption = "Counts of 'short_emp_woe' by Group")
| group_GA | -0.0244209293279626 | 0.161997334092502 |
|---|---|---|
| 1 | 2466 | 288 |
| 2 | 3571 | 543 |
| 3 | 3411 | 519 |
| 4 | 2648 | 372 |
| 5 | 4835 | 720 |
| 6 | 1103 | 211 |
| 7 | 3269 | 490 |
| 8 | 6910 | 891 |
| 9 | 2543 | 432 |
| 10 | 5017 | 630 |
#purpose
# Cross-tab: counts of each purpose WOE level within each GA cluster.
purpose_summary <- df1 %>%
  count(group_GA, purpose_woe, name = "count") %>%
  pivot_wider(
    names_from = purpose_woe,
    values_from = count,
    values_fill = list(count = 0)
  )
kable(purpose_summary, caption = "Counts of 'purpose_woe' by Group")
| group_GA | -0.196831871058625 | -0.196608100580901 | -0.160680606700238 | 0.0273586138416367 | 0.315718834677194 |
|---|---|---|---|---|---|
| 1 | 305 | 184 | 182 | 1505 | 578 |
| 2 | 493 | 309 | 409 | 2025 | 878 |
| 3 | 710 | 318 | 400 | 1923 | 579 |
| 4 | 871 | 181 | 175 | 1521 | 272 |
| 5 | 1246 | 112 | 252 | 3622 | 323 |
| 6 | 297 | 61 | 81 | 753 | 122 |
| 7 | 592 | 98 | 156 | 2532 | 381 |
| 8 | 1607 | 405 | 404 | 4486 | 899 |
| 9 | 518 | 389 | 427 | 1153 | 488 |
| 10 | 1489 | 213 | 296 | 3204 | 445 |
#delinq_2yrs
# Cross-tab: counts of each delinq_2yrs WOE level within each GA cluster.
delinq_2yrs_summary <- df1 %>%
  count(group_GA, delinq_2yrs_woe, name = "count") %>%
  pivot_wider(
    names_from = delinq_2yrs_woe,
    values_from = count,
    values_fill = list(count = 0)
  )
kable(delinq_2yrs_summary, caption = "Counts of 'delinq_2yrs_woe' by Group")
| group_GA | -0.0108046746319754 | 0.0639753139202767 |
|---|---|---|
| 1 | 2176 | 578 |
| 2 | 3132 | 982 |
| 3 | 3325 | 605 |
| 4 | 2831 | 189 |
| 5 | 4981 | 574 |
| 6 | 1253 | 61 |
| 7 | 3117 | 642 |
| 8 | 6550 | 1251 |
| 9 | 2776 | 199 |
| 10 | 4938 | 709 |
# Re-display of the grade cross-tab (duplicate of the earlier kable call).
kable(grade_summary, caption = "Counts of 'grade_woe' by Group")
| group_GA | 0.860425570044054 | 0.103685264830283 | 0.421220703081032 | -0.319075617367842 | -1.12236611095412 |
|---|---|---|---|---|---|
| 1 | 2754 | 0 | 0 | 0 | 0 |
| 2 | 166 | 2561 | 1387 | 0 | 0 |
| 3 | 0 | 0 | 0 | 3930 | 0 |
| 4 | 0 | 0 | 0 | 0 | 3020 |
| 5 | 0 | 2762 | 0 | 2793 | 0 |
| 6 | 0 | 0 | 0 | 0 | 1314 |
| 7 | 1787 | 0 | 1972 | 0 | 0 |
| 8 | 0 | 4746 | 3055 | 0 | 0 |
| 9 | 0 | 0 | 0 | 0 | 2975 |
| 10 | 0 | 0 | 0 | 5647 | 0 |
# Re-display of the short_emp cross-tab (duplicate of the earlier kable call).
kable(short_emp_summary, caption = "Counts of 'short_emp_woe' by Group")
| group_GA | -0.0244209293279626 | 0.161997334092502 |
|---|---|---|
| 1 | 2466 | 288 |
| 2 | 3571 | 543 |
| 3 | 3411 | 519 |
| 4 | 2648 | 372 |
| 5 | 4835 | 720 |
| 6 | 1103 | 211 |
| 7 | 3269 | 490 |
| 8 | 6910 | 891 |
| 9 | 2543 | 432 |
| 10 | 5017 | 630 |
# Re-display of the purpose cross-tab (duplicate of the earlier kable call).
kable(purpose_summary, caption = "Counts of 'purpose_woe' by Group")
| group_GA | -0.196831871058625 | -0.196608100580901 | -0.160680606700238 | 0.0273586138416367 | 0.315718834677194 |
|---|---|---|---|---|---|
| 1 | 305 | 184 | 182 | 1505 | 578 |
| 2 | 493 | 309 | 409 | 2025 | 878 |
| 3 | 710 | 318 | 400 | 1923 | 579 |
| 4 | 871 | 181 | 175 | 1521 | 272 |
| 5 | 1246 | 112 | 252 | 3622 | 323 |
| 6 | 297 | 61 | 81 | 753 | 122 |
| 7 | 592 | 98 | 156 | 2532 | 381 |
| 8 | 1607 | 405 | 404 | 4486 | 899 |
| 9 | 518 | 389 | 427 | 1153 | 488 |
| 10 | 1489 | 213 | 296 | 3204 | 445 |
# Re-display of the delinq_2yrs cross-tab (duplicate of the earlier kable call).
kable(delinq_2yrs_summary, caption = "Counts of 'delinq_2yrs_woe' by Group")
| group_GA | -0.0108046746319754 | 0.0639753139202767 |
|---|---|---|
| 1 | 2176 | 578 |
| 2 | 3132 | 982 |
| 3 | 3325 | 605 |
| 4 | 2831 | 189 |
| 5 | 4981 | 574 |
| 6 | 1253 | 61 |
| 7 | 3117 | 642 |
| 8 | 6550 | 1251 |
| 9 | 2776 | 199 |
| 10 | 4938 | 709 |
# agglomerative hierarchical clustering
# Dissimilarity matrix
df3 <- data.frame(results)
set.seed(9)
# Work on a random sample of 100 points (hierarchical clustering scales
# poorly with n).
df4 <- df3 %>%
sample_n(100)
d <- dist(df4, method = "euclidean")
# Hierarchical clustering: Ward's method (hc1) and complete linkage (hc2)
hc1 <- hclust(d, method = "ward.D" )
hc2 <- hclust(d, method = "complete" )
# Choose the number of clusters: elbow (WSS) and gap statistic.
fviz_nbclust(df4, FUN = hcut, method = "wss")
gap_stat <- clusGap(df4, FUN = hcut, nstart = 50, K.max = 10, B = 50)
fviz_gap_stat(gap_stat)
# Plot the obtained dendrogram
plot(hc1, cex = 0.6, hang = -1)
rect.hclust(hc1, k = 6, border = 2:5)
plot(hc2, cex = 0.6, hang = -1)
rect.hclust(hc2, k = 6, border = 2:5)
# Cut each tree into 6 groups (comment previously said 3, code uses k = 6)
sub_grp1 <- cutree(hc1, k = 6)
sub_grp2 <- cutree(hc2, k = 6)
# Number of members in each cluster
# Cluster sizes for the Ward (sub_grp1) and complete-linkage (sub_grp2) cuts.
table(sub_grp1)
#> sub_grp1
#> 1 2 3 4 5 6
#> 19 7 28 9 18 19
table(sub_grp2)
#> sub_grp2
#> 1 2 3 4 5 6
#> 34 7 24 10 17 8
# Visualise the two hierarchical solutions on the sampled PCA scores.
fviz_cluster(list(data = df4, cluster = sub_grp1)) +
  ggtitle("HCluster for Ward.D") +
  theme_minimal()
fviz_cluster(list(data = df4, cluster = sub_grp2)) +
  ggtitle("HCluster for Complete") +
  theme_minimal()
# Compare the two dendrograms side by side.
dend1 <- as.dendrogram(hc1)
dend2 <- as.dendrogram(hc2)
tanglegram(dend1, dend2)
# Second tanglegram: colour common subtrees and report the entanglement
# score (0 = perfectly aligned, 1 = maximally tangled).
dend_list <- dendlist(dend1, dend2)
tanglegram(
  dend1, dend2,
  highlight_distinct_edges = FALSE,
  common_subtrees_color_lines = FALSE,
  common_subtrees_color_branches = TRUE,
  main = paste("entanglement =", round(entanglement(dend_list), 2))
)
# divisive
# compute divisive hierarchical clustering
hc3 <- diana(df4)
# Divisive coefficient; amount of clustering structure found
hc3$dc
#> [1] 0.9243702
# plot dendrogram
pltree(hc3, cex = 0.6, hang = -1, main = "Dendrogram of diana")
set.seed(10)
# Perform DBSCAN clustering
# NOTE(review): eps = 0.5 / minPts = 5 look like default-ish values — confirm
# they were tuned for these PCA scores (e.g. via a kNN distance plot).
dbscan_result <- dbscan(results, eps = 0.5, minPts = 5)
# Add cluster assignments to the original data
df1$groupdb <- as.factor(dbscan_result$cluster)
# Visualize the clusters
fviz_cluster(dbscan_result, data = results)+
ggtitle("dbscanCluster for result of pca") +
theme_minimal()
# Davies–Bouldin index (lower is better) for each clustering of the PCA scores.
davies_bouldin_kmeans <- index.DB(results, km_res$cluster, centrotypes = "centroids")
davies_bouldin_dbscan <- index.DB(results, dbscan_result$cluster, centrotypes = "centroids")
davies_bouldin_GA <- index.DB(results, optimal_kmeans$cluster, centrotypes = "centroids")
# Pull out the scalar index values.
davies_bouldin_kmeans_score <- davies_bouldin_kmeans$DB
davies_bouldin_dbscan_score <- davies_bouldin_dbscan$DB
davies_bouldin_GA_score <- davies_bouldin_GA$DB
# Tabulate the three methods for display.
comparison_table <- data.frame(
  Method = c("kmeans", "DBSCAN", "GA"),
  Davies_Bouldin_Index = c(
    davies_bouldin_kmeans_score,
    davies_bouldin_dbscan_score,
    davies_bouldin_GA_score
  )
)
# Display the table
knitr::kable(comparison_table, caption = "Comparison of Clustering Methods")
| Method | Davies_Bouldin_Index |
|---|---|
| kmeans | 1.2668721 |
| DBSCAN | 0.8394707 |
| GA | 1.0132065 |
# Train/test split: ~70/30 random partition of the WOE data.
# (set.seed(00) parses as 0; TRUE spelled out — T is reassignable.)
set.seed(0)
index <- sample(2, size = nrow(df_train), replace = TRUE, prob = c(0.7, 0.3))
train <- df_train[index == 1, ]
test <- df_train[index == 2, ]
# Logistic regression of bad_loans on all WOE predictors.
logit1 <- glm(formula = bad_loans ~ ., family = stats::binomial("logit"), data = train)
summary(logit1)
#>
#> Call:
#> glm(formula = bad_loans ~ ., family = stats::binomial("logit"),
#> data = train)
#>
#> Deviance Residuals:
#> Min 1Q Median 3Q Max
#> -1.4443 -0.6890 -0.5277 -0.3346 2.5906
#>
#> Coefficients: (1 not defined because of singularities)
#> Estimate Std. Error z value Pr(>|z|)
#> (Intercept) -1.44999 0.01837 -78.929 < 2e-16 ***
#> loan_amnt_woe 1.94050 0.74046 2.621 0.008776 **
#> funded_amnt_woe -2.25716 0.75274 -2.999 0.002712 **
#> grade_woe 0.85637 0.03408 25.131 < 2e-16 ***
#> sub_grade_num_woe 3.22561 0.61119 5.278 1.31e-07 ***
#> short_emp_woe -0.13674 1.13185 -0.121 0.903843
#> emp_length_num_woe 1.10872 1.10840 1.000 0.317167
#> home_ownership_woe 0.43130 0.12808 3.368 0.000758 ***
#> dti_woe 0.56028 0.06416 8.733 < 2e-16 ***
#> purpose_woe 0.73032 0.10246 7.128 1.02e-12 ***
#> payment_inc_ratio_woe 0.77801 0.04835 16.090 < 2e-16 ***
#> delinq_2yrs_woe 132.46073 1301.84444 0.102 0.918956
#> delinq_2yrs_zero_woe -135.01281 1328.62828 -0.102 0.919060
#> inq_last_6mths_woe 0.69061 0.08585 8.045 8.65e-16 ***
#> last_delinq_none_woe -3.16821 1.72839 -1.833 0.066796 .
#> last_major_derog_none_woe 3.83650 1.71383 2.239 0.025185 *
#> open_acc_woe 0.95787 0.35473 2.700 0.006929 **
#> pub_rec_woe 6.16589 2.47378 2.493 0.012685 *
#> pub_rec_zero_woe NA NA NA NA
#> revol_util_woe 0.33192 0.06034 5.501 3.78e-08 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> (Dispersion parameter for binomial family taken to be 1)
#>
#> Null deviance: 27928 on 28621 degrees of freedom
#> Residual deviance: 25795 on 28603 degrees of freedom
#> AIC: 25833
#>
#> Number of Fisher Scoring iterations: 10
# Predictions on training and test data
# NOTE(review): train_pred1 scores the FULL df_train (train + test rows),
# not just the training partition — confirm this is intentional.
train_pred1 <- predict(logit1, df_train, type = 'response')
#> Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
#> prediction from a rank-deficient fit may be misleading
test_pred1 <- predict(logit1, test, type = 'response')
#> Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
#> prediction from a rank-deficient fit may be misleading
test$test_pred1 <- test_pred1
df_train$train_pred1 <- train_pred1
# Optimal cutoff determination
# prediction()/performance() are ROCR functions — presumably attached earlier
# in the file. The accuracy curve is scanned for the probability cutoff that
# maximises training accuracy.
pred1 <- prediction(train_pred1, df_train$bad_loans)
perform1 <- performance(pred1, "acc")
# Index of the accuracy-maximising cutoff along the performance curve.
max1 <- which.max(slot(perform1, "y.values")[[1]])
prob1 <- slot(perform1, "x.values")[[1]][max1]
prob1
#> 6127
#> 0.4876197
# AUC calculation for training data
auc_bin1 <- performance(pred1, "auc")
auc_bin1 <- unlist(slot(auc_bin1, "y.values"))
auc_bin1
#> [1] 0.6906972
# Confusion Matrix - Training Data
# Classify using the accuracy-optimal cutoff prob1 found above.
train_pred_class <- ifelse(train_pred1 > prob1, 1, 0)
tble_cf <- table(Predicted = train_pred_class, Actual = df_train$bad_loans)
tble_cf
#> Actual
#> Predicted 0 1
#> 0 32805 7531
#> 1 253 280
# Classification Table - Training Data
# Cells are extracted by dimname so the TP/FP/FN/TN mapping is explicit.
TP <- tble_cf["1", "1"]
FP <- tble_cf["1", "0"]
FN <- tble_cf["0", "1"]
TN <- tble_cf["0", "0"]
Specificity <- TN / (TN + FP)
Sensitivity <- TP / (TP + FN)
Precision <- TP / (TP + FP)
classification_metrics_train <- data.frame(Specificity, Sensitivity, Precision)
classification_metrics_train
#> Specificity Sensitivity Precision
#> 1 0.9923468 0.03584688 0.5253283
# NOTE(review): sensitivity is only ~3.6% — maximising raw accuracy on this
# imbalanced outcome predicts almost everything as a good loan.
# Misclassification Error - Training Data
misclassification_error_train <- 1 - sum(diag(tble_cf)) / sum(tble_cf)
misclassification_error_train
#> [1] 0.1904622
# ROC and AUC - Training Data
# roc()/ggroc()/auc() presumably come from pROC (loaded earlier in the file).
roc_train <- roc(df_train$bad_loans, train_pred1)
#> Setting levels: control = 0, case = 1
#> Setting direction: controls < cases
roc_test <- roc(test$bad_loans, test_pred1)
#> Setting levels: control = 0, case = 1
#> Setting direction: controls < cases
# Plot ROC Curve
ggroc(roc_train) +
ggtitle("ROC Curve for Training Data") +
xlab("1 - Specificity") +
ylab("Sensitivity")
ggroc(roc_test) +
ggtitle("ROC Curve for Test Data") +
xlab("1 - Specificity") +
ylab("Sensitivity")
# AUC Calculation
auc_train <- auc(roc_train)
auc_test <- auc(roc_test)
auc_train
#> Area under the curve: 0.6907
auc_test
#> Area under the curve: 0.6836
# Drop the appended prediction column before segmenting, then attach the
# k-means cluster assignment for each training observation.
df_train <- dplyr::select(df_train, -train_pred1)
df_train$groupkm <- km_res$cluster
# Segment 1: loans assigned to k-means cluster 1, keeping the original
# 21 modelling columns.
segment1 <- df_train[df_train$groupkm == 1, ]
segment1 <- segment1[, 1:21]
# Reproducible 70/30 train/test split within the segment.
# (Changed replace = T to replace = TRUE — T is reassignable and non-idiomatic.)
set.seed(1111)
index1 <- sample(2, size = nrow(segment1), replace = TRUE, prob = c(0.7, 0.3))
train1 <- segment1[index1 == 1, ]
test1 <- segment1[index1 == 2, ]
# Segment-specific WOE logistic regression.
# NOTE(review): groupkm survives the 1:21 selection and is constant within
# the segment, hence its NA (aliased) coefficient below — consider excluding
# it from the formula.
logit_Seg1 <- glm(formula = bad_loans ~ ., family = stats::binomial("logit"), data = train1)
summary(logit_Seg1)
#>
#> Call:
#> glm(formula = bad_loans ~ ., family = stats::binomial("logit"),
#> data = train1)
#>
#> Deviance Residuals:
#> Min 1Q Median 3Q Max
#> -0.8684 -0.4241 -0.3519 -0.2834 2.7032
#>
#> Coefficients: (3 not defined because of singularities)
#> Estimate Std. Error z value Pr(>|z|)
#> (Intercept) -2.2293 0.1070 -20.841 < 2e-16 ***
#> loan_amnt_woe -4.5146 3.8721 -1.166 0.24364
#> funded_amnt_woe 3.3370 3.9311 0.849 0.39595
#> grade_woe NA NA NA NA
#> sub_grade_num_woe 5.2655 2.7570 1.910 0.05615 .
#> short_emp_woe -1.4831 3.8473 -0.385 0.69988
#> emp_length_num_woe 1.7646 3.7840 0.466 0.64098
#> home_ownership_woe 0.4044 0.4517 0.895 0.37062
#> dti_woe 0.4234 0.2421 1.748 0.08038 .
#> purpose_woe 1.1245 0.3354 3.353 0.00080 ***
#> payment_inc_ratio_woe 1.0673 0.1773 6.020 1.75e-09 ***
#> delinq_2yrs_woe 143.0207 4342.6574 0.033 0.97373
#> delinq_2yrs_zero_woe -143.0564 4432.0016 -0.032 0.97425
#> inq_last_6mths_woe 0.8947 0.3281 2.727 0.00639 **
#> last_delinq_none_woe 8.3971 6.8792 1.221 0.22222
#> last_major_derog_none_woe 3.6373 9.8150 0.371 0.71095
#> open_acc_woe 0.5689 1.2771 0.445 0.65598
#> pub_rec_woe 19.5928 20.9738 0.934 0.35022
#> pub_rec_zero_woe NA NA NA NA
#> revol_util_woe 0.8619 0.2135 4.038 5.39e-05 ***
#> groupkm NA NA NA NA
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> (Dispersion parameter for binomial family taken to be 1)
#>
#> Null deviance: 2635.8 on 5047 degrees of freedom
#> Residual deviance: 2531.2 on 5030 degrees of freedom
#> AIC: 2567.2
#>
#> Number of Fisher Scoring iterations: 11
# Predictions on training and test data
# Fitted default probabilities from the segment-1 logistic model.
trainpred_seg1 <- predict(logit_Seg1, train1, type = 'response')
#> Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
#> prediction from a rank-deficient fit may be misleading
testpred_seg1 <- predict(logit_Seg1, test1, type = 'response')
#> Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
#> prediction from a rank-deficient fit may be misleading
# NOTE(review): appending predictions onto train1/test1 means any later model
# built from these frames (e.g. the XGBoost DMatrix built from train1[, -1]
# further below) will include the logit prediction as a feature — confirm
# this is intended.
test1$testpred_seg1 <- testpred_seg1
train1$trainpred_seg1 <- trainpred_seg1
# Optimal cutoff determination
pred1_seg <- prediction(trainpred_seg1, train1$bad_loans)
performseg1 <- performance(pred1_seg, "acc")
# NOTE(review): max1 is reused (overwritten) from the pooled-model section.
max1 <- which.max(slot(performseg1, "y.values")[[1]])
probseg1 <- slot(performseg1, "x.values")[[1]][max1]
probseg1
# Inf means accuracy is maximised by predicting no defaults at all in this
# segment — the "optimal" cutoff sits above every fitted probability.
#>
#> Inf
# AUC calculation for training data
auc_binseg1 <- performance(pred1_seg, "auc")
auc_binseg1 <- unlist(slot(auc_binseg1, "y.values"))
auc_binseg1
#> [1] 0.6562001
# Confusion Matrix - Training Data
# BUG FIX: the original thresholded at prob1 (the pooled model's cutoff);
# probseg1, the accuracy-optimal cutoff computed for this segment above, is
# the intended threshold.
# NOTE(review): probseg1 printed as Inf above, so every observation is
# classified 0; the recorded table below (produced with the old prob1 cutoff)
# happens to be identical because no fitted probability exceeded prob1 either.
trainseg_pred_class <- ifelse(trainpred_seg1 > probseg1, 1, 0)
tble_trainseg <- table(Predicted = trainseg_pred_class, Actual = train1$bad_loans)
tble_trainseg
#> Actual
#> Predicted 0 1
#> 0 4680 368
# ROC and AUC - Training Data
roc_trainseg <- roc(train1$bad_loans, trainpred_seg1)
#> Setting levels: control = 0, case = 1
#> Setting direction: controls < cases
roc_testseg <- roc(test1$bad_loans, testpred_seg1)
#> Setting levels: control = 0, case = 1
#> Setting direction: controls < cases
# Plot ROC Curve
ggroc(roc_trainseg) +
ggtitle("ROC Curve for Trainingseg Data") +
xlab("1 - Specificityseg") +
ylab("Sensitivityseg")
ggroc(roc_testseg) +
ggtitle("ROC Curve for Testseg Data") +
xlab("1 - Specificityseg") +
ylab("Sensitivityseg")
# AUC Calculation
auc_trainseg <- auc(roc_trainseg)
auc_testseg <- auc(roc_testseg)
auc_trainseg
#> Area under the curve: 0.6562
auc_testseg
#> Area under the curve: 0.6175
# Distribution of segment-1 fitted probabilities.
hist(trainpred_seg1)
# Segment 2: loans assigned to k-means cluster 2, keeping the original
# 21 modelling columns.
segment2 <- df_train[df_train$groupkm == 2, ]
segment2 <- segment2[, 1:21]
# Reproducible 70/30 train/test split within the segment.
# (Changed replace = T to replace = TRUE — T is reassignable and non-idiomatic.)
set.seed(2222)
index2 <- sample(2, size = nrow(segment2), replace = TRUE, prob = c(0.7, 0.3))
train2 <- segment2[index2 == 1, ]
test2 <- segment2[index2 == 2, ]
# Segment-specific WOE logistic regression. groupkm survives the 1:21
# selection and is constant within the segment, hence its NA coefficient below.
logit_Seg2 <- glm(formula = bad_loans ~ ., family = stats::binomial("logit"), data = train2)
summary(logit_Seg2)
#>
#> Call:
#> glm(formula = bad_loans ~ ., family = stats::binomial("logit"),
#> data = train2)
#>
#> Deviance Residuals:
#> Min 1Q Median 3Q Max
#> -1.0589 -0.6174 -0.5347 -0.4456 2.4432
#>
#> Coefficients: (2 not defined because of singularities)
#> Estimate Std. Error z value Pr(>|z|)
#> (Intercept) -1.48995 0.03425 -43.499 < 2e-16 ***
#> loan_amnt_woe 2.65206 1.06644 2.487 0.012888 *
#> funded_amnt_woe -3.03155 1.08883 -2.784 0.005366 **
#> grade_woe 0.74485 0.11718 6.356 2.07e-10 ***
#> sub_grade_num_woe 4.11972 0.96288 4.279 1.88e-05 ***
#> short_emp_woe 0.89722 1.69348 0.530 0.596247
#> emp_length_num_woe 0.45522 1.66172 0.274 0.784127
#> home_ownership_woe 0.44072 0.19420 2.269 0.023244 *
#> dti_woe 0.65359 0.10061 6.496 8.23e-11 ***
#> purpose_woe 0.56190 0.15509 3.623 0.000291 ***
#> payment_inc_ratio_woe 0.71880 0.08655 8.305 < 2e-16 ***
#> delinq_2yrs_woe 151.66043 2160.39968 0.070 0.944034
#> delinq_2yrs_zero_woe -153.42792 2204.84728 -0.070 0.944523
#> inq_last_6mths_woe 0.67703 0.13302 5.090 3.59e-07 ***
#> last_delinq_none_woe -6.81050 2.62895 -2.591 0.009582 **
#> last_major_derog_none_woe 5.07937 2.63502 1.928 0.053900 .
#> open_acc_woe 1.44641 0.53928 2.682 0.007316 **
#> pub_rec_woe 5.33259 3.64037 1.465 0.142962
#> pub_rec_zero_woe NA NA NA NA
#> revol_util_woe 0.28205 0.08855 3.185 0.001447 **
#> groupkm NA NA NA NA
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> (Dispersion parameter for binomial family taken to be 1)
#>
#> Null deviance: 11953 on 13825 degrees of freedom
#> Residual deviance: 11662 on 13807 degrees of freedom
#> AIC: 11700
#>
#> Number of Fisher Scoring iterations: 11
# Predictions on training and test data
# Fitted default probabilities from the segment-2 logistic model.
trainpred_seg2 <- predict(logit_Seg2, train2, type = 'response')
#> Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
#> prediction from a rank-deficient fit may be misleading
testpred_seg2 <- predict(logit_Seg2, test2, type = 'response')
#> Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
#> prediction from a rank-deficient fit may be misleading
# NOTE(review): as with segment 1, these appended columns become XGBoost
# features later via train2[, -1] — confirm this is intended.
test2$testpred_seg2 <- testpred_seg2
train2$trainpred_seg2 <- trainpred_seg2
# Optimal cutoff determination
pred2_seg <- prediction(trainpred_seg2, train2$bad_loans)
performseg2 <- performance(pred2_seg, "acc")
# Accuracy-maximising cutoff for segment 2.
max2 <- which.max(slot(performseg2, "y.values")[[1]])
probseg2 <- slot(performseg2, "x.values")[[1]][max2]
probseg2
#> 5979
#> 0.3734171
# AUC calculation for training data
auc_binseg2 <- performance(pred2_seg, "auc")
auc_binseg2 <- unlist(slot(auc_binseg2, "y.values"))
auc_binseg2
#> [1] 0.6109219
# Confusion Matrix - Training Data
# BUG FIX: the original thresholded at prob1 (the pooled model's cutoff);
# probseg2, the accuracy-optimal cutoff computed for this segment above, is
# the intended threshold.
# NOTE(review): the table below was recorded with the old prob1 (0.488)
# cutoff and will differ on rerun — probseg2 (0.373) is lower, so some
# observations will now be classified 1.
trainseg2_pred_class <- ifelse(trainpred_seg2 > probseg2, 1, 0)
tble_trainseg2 <- table(Predicted = trainseg2_pred_class, Actual = train2$bad_loans)
tble_trainseg2
#> Actual
#> Predicted 0 1
#> 0 11675 2151
# ROC and AUC - Training Data
roc_trainseg2 <- roc(train2$bad_loans, trainpred_seg2)
#> Setting levels: control = 0, case = 1
#> Setting direction: controls < cases
roc_testseg2 <- roc(test2$bad_loans, testpred_seg2)
#> Setting levels: control = 0, case = 1
#> Setting direction: controls < cases
# AUC Calculation
# NOTE(review): unlike segments 1 and 3, no ggroc() plots are produced here.
auc_trainseg2 <- auc(roc_trainseg2)
auc_testseg2 <- auc(roc_testseg2)
auc_trainseg2
#> Area under the curve: 0.6109
auc_testseg2
#> Area under the curve: 0.6038
# Distribution of segment-2 fitted probabilities.
hist(trainpred_seg2)
# Segment 3: loans assigned to k-means cluster 3, keeping the original
# 21 modelling columns.
segment3 <- df_train[df_train$groupkm == 3, ]
segment3 <- segment3[, 1:21]
# Reproducible 70/30 train/test split within the segment.
# (Changed replace = T to replace = TRUE — T is reassignable and non-idiomatic.)
set.seed(3333)
index3 <- sample(2, size = nrow(segment3), replace = TRUE, prob = c(0.7, 0.3))
train3 <- segment3[index3 == 1, ]
test3 <- segment3[index3 == 2, ]
# Segment-specific WOE logistic regression. groupkm survives the 1:21
# selection and is constant within the segment, hence its NA coefficient below.
logit_Seg3 <- glm(formula = bad_loans ~ ., family = stats::binomial("logit"), data = train3)
summary(logit_Seg3)
#>
#> Call:
#> glm(formula = bad_loans ~ ., family = stats::binomial("logit"),
#> data = train3)
#>
#> Deviance Residuals:
#> Min 1Q Median 3Q Max
#> -1.3902 -0.8672 -0.7407 1.3266 2.0066
#>
#> Coefficients: (3 not defined because of singularities)
#> Estimate Std. Error z value Pr(>|z|)
#> (Intercept) -1.40018 0.05227 -26.790 < 2e-16 ***
#> loan_amnt_woe 1.77443 1.11187 1.596 0.11051
#> funded_amnt_woe -2.10345 1.12636 -1.867 0.06184 .
#> grade_woe 0.76927 0.08293 9.276 < 2e-16 ***
#> sub_grade_num_woe 2.56629 0.84299 3.044 0.00233 **
#> short_emp_woe -0.80541 1.65932 -0.485 0.62740
#> emp_length_num_woe 1.41109 1.62053 0.871 0.38388
#> home_ownership_woe 0.51826 0.18548 2.794 0.00520 **
#> dti_woe 0.43239 0.08990 4.810 1.51e-06 ***
#> purpose_woe 0.72266 0.15089 4.789 1.67e-06 ***
#> payment_inc_ratio_woe 0.80587 0.06940 11.613 < 2e-16 ***
#> delinq_2yrs_woe 0.57485 0.92722 0.620 0.53528
#> delinq_2yrs_zero_woe NA NA NA NA
#> inq_last_6mths_woe 0.62154 0.12073 5.148 2.63e-07 ***
#> last_delinq_none_woe -6.28019 2.47936 -2.533 0.01131 *
#> last_major_derog_none_woe -0.22212 2.32830 -0.095 0.92400
#> open_acc_woe 0.57507 0.50910 1.130 0.25865
#> pub_rec_woe 2.82908 3.42105 0.827 0.40826
#> pub_rec_zero_woe NA NA NA NA
#> revol_util_woe 0.29099 0.09412 3.092 0.00199 **
#> groupkm NA NA NA NA
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> (Dispersion parameter for binomial family taken to be 1)
#>
#> Null deviance: 11828 on 9635 degrees of freedom
#> Residual deviance: 11502 on 9618 degrees of freedom
#> AIC: 11538
#>
#> Number of Fisher Scoring iterations: 4
# Predictions on training and test data
# Fitted default probabilities from the segment-3 logistic model.
trainpred_seg3 <- predict(logit_Seg3, train3, type = 'response')
#> Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
#> prediction from a rank-deficient fit may be misleading
testpred_seg3 <- predict(logit_Seg3, test3, type = 'response')
#> Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
#> prediction from a rank-deficient fit may be misleading
# NOTE(review): as with segments 1 and 2, these appended columns become
# XGBoost features later via train3[, -1] — confirm this is intended.
test3$testpred_seg3 <- testpred_seg3
train3$trainpred_seg3 <- trainpred_seg3
# Optimal cutoff determination
pred3_seg <- prediction(trainpred_seg3, train3$bad_loans)
performseg3 <- performance(pred3_seg, "acc")
# Accuracy-maximising cutoff for segment 3.
max3 <- which.max(slot(performseg3, "y.values")[[1]])
probseg3 <- slot(performseg3, "x.values")[[1]][max3]
probseg3
#> 6143
#> 0.4546298
# AUC calculation for training data
auc_binseg3 <- performance(pred3_seg, "auc")
auc_binseg3 <- unlist(slot(auc_binseg3, "y.values"))
auc_binseg3
#> [1] 0.6112667
# Confusion Matrix - Training Data
# BUG FIX: the original thresholded at prob1 (the pooled model's cutoff);
# probseg3, the accuracy-optimal cutoff computed for this segment above, is
# the intended threshold.
# NOTE(review): the table below was recorded with the old prob1 (0.488)
# cutoff and will shift slightly on rerun with probseg3 (0.455).
trainseg3_pred_class <- ifelse(trainpred_seg3 > probseg3, 1, 0)
tble_trainseg3 <- table(Predicted = trainseg3_pred_class, Actual = train3$bad_loans)
tble_trainseg3
#> Actual
#> Predicted 0 1
#> 0 6582 2777
#> 1 130 147
# ROC and AUC - Training Data
roc_trainseg3 <- roc(train3$bad_loans, trainpred_seg3)
#> Setting levels: control = 0, case = 1
#> Setting direction: controls < cases
roc_testseg3 <- roc(test3$bad_loans, testpred_seg3)
#> Setting levels: control = 0, case = 1
#> Setting direction: controls < cases
# Plot ROC Curve
ggroc(roc_trainseg3) +
ggtitle("ROC Curve for Trainingseg3 Data") +
xlab("1 - Specificityseg3") +
ylab("Sensitivityseg3")
ggroc(roc_testseg3) +
ggtitle("ROC Curve for Testseg3 Data") +
xlab("1 - Specificityseg3") +
ylab("Sensitivityseg3")
# AUC Calculation
auc_trainseg3 <- auc(roc_trainseg3)
auc_testseg3 <- auc(roc_testseg3)
auc_trainseg3
#> Area under the curve: 0.6113
auc_testseg3
#> Area under the curve: 0.6179
# Segment sizes for the summary tables.
# Changed from as.numeric(count(segment)) — dplyr::count() returns a 1x1
# tibble coerced via as.numeric(); nrow() gives the row count directly and
# type-stably, with the same value.
count_seg1 <- nrow(segment1)
count_seg2 <- nrow(segment2)
count_seg3 <- nrow(segment3)
# Create a data frame with the accuracy metrics for each group
results_logitmodel <- data.frame(
  Group = c("Group 1", "Group 2", "Group 3"),
  AUC_Train = c(auc_trainseg, auc_trainseg2, auc_trainseg3),
  AUC_Test = c(auc_testseg, auc_testseg2, auc_testseg3),
  count = c(count_seg1, count_seg2, count_seg3)
)
# Print the results table
print(results_logitmodel)
#> Group AUC_Train AUC_Test count
#> 1 Group 1 0.6562001 0.6175226 7309
#> 2 Group 2 0.6109219 0.6037607 19836
#> 3 Group 3 0.6112667 0.6179009 13724
# Convert data to DMatrix format for XGBoost
# NOTE(review): train1[, -1] drops only the first column (presumably
# bad_loans, since it is passed separately as label) — trainpred_seg1,
# appended earlier, remains as a feature, leaking the logit model's output
# into XGBoost; confirm whether this explains the very high train AUC below.
trainxg_matrix1 <- xgb.DMatrix(data = as.matrix(train1[, -1]), label = train1$bad_loans)
testxg_matrix1 <- xgb.DMatrix(data = as.matrix(test1[, -1]), label = test1$bad_loans)
# Set parameters for XGBoost
params <- list(objective = "binary:logistic", eval_metric = "auc")
# Train the model
xgb_model1 <- xgboost(params = params, data = trainxg_matrix1, nrounds = 100, verbose = 0)
# Make predictions
train_pred_xgb1 <- predict(xgb_model1, trainxg_matrix1)
# Calculate AUC for train and test data
# NOTE(review): despite the comment above, only the train AUC is computed;
# testxg_matrix1 is built but never used for prediction or AUC.
auc_train_xgb1 <- auc(train1$bad_loans, train_pred_xgb1)
#> Setting levels: control = 0, case = 1
#> Setting direction: controls < cases
# Confusion Matrix and Metrics
# Fixed 0.5 cutoff (unlike the ROCR-derived cutoffs used for the logits).
train_pred_class_xgb1 <- ifelse(train_pred_xgb1 > 0.5, 1, 0)
# Training metrics
confusion_matrix_train_xgb1 <- table(Predicted = train_pred_class_xgb1, Actual = train1$bad_loans)
TP_train_xgb1 <- confusion_matrix_train_xgb1["1", "1"]
FP_train_xgb1 <- confusion_matrix_train_xgb1["1", "0"]
FN_train_xgb1 <- confusion_matrix_train_xgb1["0", "1"]
TN_train_xgb1 <- confusion_matrix_train_xgb1["0", "0"]
Specificity_train_xgb1 <- TN_train_xgb1 / (TN_train_xgb1 + FP_train_xgb1)
Sensitivity_train_xgb1 <- TP_train_xgb1 / (TP_train_xgb1 + FN_train_xgb1)
Precision_train_xgb1 <- TP_train_xgb1 / (TP_train_xgb1 + FP_train_xgb1)
# Convert data to DMatrix format for XGBoost
# NOTE(review): same caveat as segment 1 — train2[, -1] retains the appended
# trainpred_seg2 column as a feature, and testxg_matrix2 is never evaluated.
trainxg_matrix2 <- xgb.DMatrix(data = as.matrix(train2[, -1]), label = train2$bad_loans)
testxg_matrix2 <- xgb.DMatrix(data = as.matrix(test2[, -1]), label = test2$bad_loans)
# Set parameters for XGBoost
params <- list(objective = "binary:logistic", eval_metric = "auc")
# Train the model
xgb_model2 <- xgboost(params = params, data = trainxg_matrix2, nrounds = 100, verbose = 0)
# Make predictions
train_pred_xgb2 <- predict(xgb_model2, trainxg_matrix2)
# Calculate AUC for train and test data
auc_train_xgb2 <- auc(train2$bad_loans, train_pred_xgb2)
#> Setting levels: control = 0, case = 1
#> Setting direction: controls < cases
# Confusion Matrix and Metrics
train_pred_class_xgb2 <- ifelse(train_pred_xgb2 > 0.5, 1, 0)
# Training metrics
confusion_matrix_train_xgb2 <- table(Predicted = train_pred_class_xgb2, Actual = train2$bad_loans)
TP_train_xgb2 <- confusion_matrix_train_xgb2["1", "1"]
FP_train_xgb2 <- confusion_matrix_train_xgb2["1", "0"]
FN_train_xgb2 <- confusion_matrix_train_xgb2["0", "1"]
TN_train_xgb2 <- confusion_matrix_train_xgb2["0", "0"]
Specificity_train_xgb2 <- TN_train_xgb2 / (TN_train_xgb2 + FP_train_xgb2)
Sensitivity_train_xgb2 <- TP_train_xgb2 / (TP_train_xgb2 + FN_train_xgb2)
Precision_train_xgb2 <- TP_train_xgb2 / (TP_train_xgb2 + FP_train_xgb2)
# Convert data to DMatrix format for XGBoost
# NOTE(review): same caveat as segments 1 and 2 — train3[, -1] retains the
# appended trainpred_seg3 column as a feature, and testxg_matrix3 is never
# evaluated.
trainxg_matrix3 <- xgb.DMatrix(data = as.matrix(train3[, -1]), label = train3$bad_loans)
testxg_matrix3 <- xgb.DMatrix(data = as.matrix(test3[, -1]), label = test3$bad_loans)
# Set parameters for XGBoost
params <- list(objective = "binary:logistic", eval_metric = "auc")
# Train the model
xgb_model3 <- xgboost(params = params, data = trainxg_matrix3, nrounds = 100, verbose = 0)
# Make predictions
train_pred_xgb3 <- predict(xgb_model3, trainxg_matrix3)
# Calculate AUC for train and test data
auc_train_xgb3 <- auc(train3$bad_loans, train_pred_xgb3)
#> Setting levels: control = 0, case = 1
#> Setting direction: controls < cases
# Confusion Matrix and Metrics
train_pred_class_xgb3 <- ifelse(train_pred_xgb3 > 0.5, 1, 0)
# Training metrics
confusion_matrix_train_xgb3 <- table(Predicted = train_pred_class_xgb3, Actual = train3$bad_loans)
TP_train_xgb3 <- confusion_matrix_train_xgb3["1", "1"]
FP_train_xgb3 <- confusion_matrix_train_xgb3["1", "0"]
FN_train_xgb3 <- confusion_matrix_train_xgb3["0", "1"]
TN_train_xgb3 <- confusion_matrix_train_xgb3["0", "0"]
Specificity_train_xgb3 <- TN_train_xgb3 / (TN_train_xgb3 + FP_train_xgb3)
Sensitivity_train_xgb3 <- TP_train_xgb3 / (TP_train_xgb3 + FN_train_xgb3)
Precision_train_xgb3 <- TP_train_xgb3 / (TP_train_xgb3 + FP_train_xgb3)
# Create a data frame with the accuracy metrics for each group
# NOTE(review): these AUCs are in-sample only — no held-out (test) AUC is
# computed for XGBoost, so they are not comparable to the logit AUC_Test
# values reported earlier.
results_xgb <- data.frame(
Group = c("Group 1", "Group 2", "Group 3"),
AUC_Train = c(auc_train_xgb1, auc_train_xgb2, auc_train_xgb3),
count = c(count_seg1,count_seg2,count_seg3)
)
# Print the results table
print(results_xgb)
#> Group AUC_Train count
#> 1 Group 1 0.9935793 7309
#> 2 Group 2 0.9105632 19836
#> 3 Group 3 0.9275514 13724
# Combine the results from both models into one data frame (train AUC only —
# the XGBoost models have no test AUC to compare).
combined_results_models <- data.frame(
  Model = rep(c("logit", "XGBoost"), each = 3),
  Group = rep(c("Group 1", "Group 2", "Group 3"), times = 2),
  AUC_Train = c(auc_trainseg, auc_trainseg2, auc_trainseg3,
                auc_train_xgb1, auc_train_xgb2, auc_train_xgb3),
  # Made the recycling explicit: the original relied on data.frame() silently
  # recycling the length-3 count vector to 6 rows; rep(..., times = 2) states
  # the intent and produces the same values.
  Count = rep(c(count_seg1, count_seg2, count_seg3), times = 2)
)
# Print the combined results table
print(combined_results_models)
#> Model Group AUC_Train Count
#> 1 logit Group 1 0.6562001 7309
#> 2 logit Group 2 0.6109219 19836
#> 3 logit Group 3 0.6112667 13724
#> 4 XGBoost Group 1 0.9935793 7309
#> 5 XGBoost Group 2 0.9105632 19836
#> 6 XGBoost Group 3 0.9275514 13724
# Pooled-model coefficients, used below as weights for a composite score.
coefficients_logit <- coef(logit1)
print(coefficients_logit)
#> (Intercept) loan_amnt_woe funded_amnt_woe
#> -1.4499876 1.9404957 -2.2571601
#> grade_woe sub_grade_num_woe short_emp_woe
#> 0.8563656 3.2256056 -0.1367362
#> emp_length_num_woe home_ownership_woe dti_woe
#> 1.1087247 0.4312961 0.5602752
#> purpose_woe payment_inc_ratio_woe delinq_2yrs_woe
#> 0.7303181 0.7780070 132.4607330
#> delinq_2yrs_zero_woe inq_last_6mths_woe last_delinq_none_woe
#> -135.0128108 0.6906149 -3.1682093
#> last_major_derog_none_woe open_acc_woe pub_rec_woe
#> 3.8364974 0.9578692 6.1658912
#> pub_rec_zero_woe revol_util_woe
#> NA 0.3319224
# Extract individual coefficients (named numeric scalars) as weights.
weight_grade <- coefficients_logit['grade_woe']
weight_emp_length_num_woe <- coefficients_logit['emp_length_num_woe']
weight_loan_amnt_woe <- coefficients_logit['loan_amnt_woe']
# NOTE(review): extracted but never used in the cluster_summary composite
# score — confirm whether delinq_2yrs_zero was meant to contribute.
weight_delinq_2yrs_zero_woe <- coefficients_logit['delinq_2yrs_zero_woe']
# Rank the GA clusters by a composite credit-risk score: cluster means of
# three WOE variables, weighted by their pooled-logit coefficients.
cluster_summary <- df1 %>%
  group_by(group_GA) %>%
  summarise(
    grade_woe = mean(grade_woe),
    emp_length_num_woe = mean(emp_length_num_woe),
    loan_amnt_woe = mean(loan_amnt_woe),
    # BUG FIX: the original passed `groups = 'drop'`, which summarise()
    # treats as a new character column (visible as a "groups" column of
    # "drop" values in the old output), not the grouping-control argument.
    # The correct argument is .groups.
    .groups = "drop"
  ) %>%
  mutate(
    composite_score = grade_woe * weight_grade +
      emp_length_num_woe * weight_emp_length_num_woe +
      loan_amnt_woe * weight_loan_amnt_woe
  ) %>%
  arrange(desc(composite_score))
print(cluster_summary)
#> # A tibble: 10 × 5
#> group_GA grade_woe emp_length_num_woe loan_amnt_woe composite_score
#> <int> <dbl> <dbl> <dbl> <dbl>
#> 1 7 0.630 -0.000535 0.139 0.810
#> 2 1 0.860 -0.00511 0.0281 0.786
#> 3 8 0.228 -0.00326 -0.0423 0.110
#> 4 2 0.241 0.000562 -0.0525 0.105
#> 5 5 -0.109 -0.000299 0.0841 0.0696
#> 6 3 -0.319 0.000533 -0.0547 -0.379
#> 7 10 -0.319 -0.00410 -0.0536 -0.382
#> 8 6 -1.12 0.00564 0.0908 -0.779
#> 9 4 -1.12 -0.00178 -0.0718 -1.10
#> 10 9 -1.12 0.00313 -0.0905 -1.13